{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 数据处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "将数据按照主题进行整理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df1 = pd.read_csv(\"./data/dev.csv\")\n",
    "df2 = pd.read_csv(\"./data/train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3 = pd.read_csv(\"./data/test_dataset.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'content', 'picture_lists', 'category', 'comment_2',\n",
       "       'comment_all'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df3.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df3['comment_2c'] = df3['comment_2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "del df3['comment_2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['id', 'content', 'picture_lists', 'category', 'ncw_label', 'fake_label',\n",
       "       'real_label', 'comment_2c', 'comment_all'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([\n",
    "#                 df1[['id', 'content', 'picture_lists', 'category', 'comment_2c', 'comment_all']], \n",
    "                df2[['id', 'content', 'picture_lists', 'category', 'comment_2c','comment_all']],\n",
    "                df3\n",
    "               ], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'政治': 0,\n",
       " '疫情': 1,\n",
       " '财经商业': 2,\n",
       " '医药健康': 3,\n",
       " '教育考试': 4,\n",
       " '军事': 5,\n",
       " '文体娱乐': 6,\n",
       " '社会生活': 7,\n",
       " '科技': 8}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dic = {item:idx for idx, item in enumerate(set(df['category'].values.tolist()))}\n",
    "dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"cat_id\"] = df.apply(lambda x: dic[x['category']], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.sample(frac=1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[['id', 'content', 'picture_lists', 'category', \"cat_id\", 'comment_2c', 'comment_all']].iloc[:50000].to_csv(\"./data/topic_train.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[['id', 'content', 'picture_lists', 'category', \"cat_id\", 'comment_2c', 'comment_all']].to_csv(\"./data/adver_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'df1' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-15-6429f6c452dd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;31m# del df1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mdf2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'df1' is not defined"
     ]
    }
   ],
   "source": [
    "df = pd.concat([df1, df2], axis=0, ignore_index=True)\n",
    "del df1\n",
    "del df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'社会生活': 0,\n",
       " '军事': 1,\n",
       " '科技': 2,\n",
       " '医药健康': 3,\n",
       " '文体娱乐': 4,\n",
       " '财经商业': 5,\n",
       " '政治': 6,\n",
       " '教育考试': 7}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dic = {item:idx for idx, item in enumerate(set(df['category'].values.tolist()))}\n",
    "dic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[\"cat_id\"] = df.apply(lambda x: dic[x['category']], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev = pd.concat([df[df.cat_id==2], df[df.cat_id==5]], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dev.to_csv(\"./data/dev_1_3.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "l = [0, 1, 3, 4, 6, 7]\n",
    "train = pd.concat([df[df.cat_id==i] for i in l], axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv(\"./data/train_1_3.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 数据基础分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"./data/train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>category</th>\n",
       "      <th>ncw_label</th>\n",
       "      <th>fake_label</th>\n",
       "      <th>real_label</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>4028762460708675</td>\n",
       "      <td>回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：</td>\n",
       "      <td>NaN</td>\n",
       "      <td>文体娱乐</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4126670854660291</td>\n",
       "      <td>//分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....</td>\n",
       "      <td>63ad082a189566eed7c4bb3e4bc55012.jpg</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3702441948299962</td>\n",
       "      <td>西宁城管围殴民警扬言要把警察打死|西宁城管围...</td>\n",
       "      <td>4986dc2a5f09a87c7af5dfc57d7775cd.jpg</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4239549419245466</td>\n",
       "      <td>【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...</td>\n",
       "      <td>dcfccfc69e90a0007afd6aafa1385e56.jpg</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3966337217183260</td>\n",
       "      <td>支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】</td>\n",
       "      <td>NaN</td>\n",
       "      <td>社会生活</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id                                            content  \\\n",
       "0  4028762460708675             回复新浪网友对博文【国家文物局限制鉴宝节目现场估价转】的评论：;;查看原文：   \n",
       "1  4126670854660291  //分享网易新闻:《发生在昆明的火锅店老板“辱滇门”，云南人该愤怒还是羞愧》|发生在昆明.....   \n",
       "2  3702441948299962                          西宁城管围殴民警扬言要把警察打死|西宁城管围...   \n",
       "3  4239549419245466  【川航航班因驾驶舱风挡破裂安全备降成都】今天上午6:26从重庆江北国际机场出发前往拉萨的四川...   \n",
       "4  3966337217183260                   支持郑强！！！//【贵州大学校长回应空姐言论:常给她们写感谢信】   \n",
       "\n",
       "                          picture_lists category  ncw_label  fake_label  \\\n",
       "0                                   NaN     文体娱乐          0           0   \n",
       "1  63ad082a189566eed7c4bb3e4bc55012.jpg     社会生活          0           0   \n",
       "2  4986dc2a5f09a87c7af5dfc57d7775cd.jpg     社会生活          0           0   \n",
       "3  dcfccfc69e90a0007afd6aafa1385e56.jpg     社会生活          0           0   \n",
       "4                                   NaN     社会生活          0           0   \n",
       "\n",
       "   real_label comment_2c comment_all  \n",
       "0           1        NaN         NaN  \n",
       "1           1        NaN         NaN  \n",
       "2           1        NaN         NaN  \n",
       "3           1        NaN         NaN  \n",
       "4           1        NaN         NaN  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "group = df.groupby(['category'])\n",
    "df1 = group.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>ncw_label</th>\n",
       "      <th>fake_label</th>\n",
       "      <th>real_label</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>军事</th>\n",
       "      <td>1978</td>\n",
       "      <td>1978</td>\n",
       "      <td>1149</td>\n",
       "      <td>1978</td>\n",
       "      <td>1978</td>\n",
       "      <td>1978</td>\n",
       "      <td>956</td>\n",
       "      <td>956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>医药健康</th>\n",
       "      <td>8123</td>\n",
       "      <td>8123</td>\n",
       "      <td>4678</td>\n",
       "      <td>8123</td>\n",
       "      <td>8123</td>\n",
       "      <td>8123</td>\n",
       "      <td>3379</td>\n",
       "      <td>3379</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>政治</th>\n",
       "      <td>1544</td>\n",
       "      <td>1544</td>\n",
       "      <td>1026</td>\n",
       "      <td>1544</td>\n",
       "      <td>1544</td>\n",
       "      <td>1544</td>\n",
       "      <td>533</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>教育考试</th>\n",
       "      <td>1002</td>\n",
       "      <td>1002</td>\n",
       "      <td>452</td>\n",
       "      <td>1002</td>\n",
       "      <td>1002</td>\n",
       "      <td>1002</td>\n",
       "      <td>500</td>\n",
       "      <td>500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>文体娱乐</th>\n",
       "      <td>10232</td>\n",
       "      <td>10232</td>\n",
       "      <td>6026</td>\n",
       "      <td>10232</td>\n",
       "      <td>10232</td>\n",
       "      <td>10232</td>\n",
       "      <td>4727</td>\n",
       "      <td>4727</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>社会生活</th>\n",
       "      <td>22258</td>\n",
       "      <td>22258</td>\n",
       "      <td>13377</td>\n",
       "      <td>22258</td>\n",
       "      <td>22258</td>\n",
       "      <td>22258</td>\n",
       "      <td>10360</td>\n",
       "      <td>10360</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>科技</th>\n",
       "      <td>2506</td>\n",
       "      <td>2506</td>\n",
       "      <td>1553</td>\n",
       "      <td>2506</td>\n",
       "      <td>2506</td>\n",
       "      <td>2506</td>\n",
       "      <td>1212</td>\n",
       "      <td>1212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>财经商业</th>\n",
       "      <td>2267</td>\n",
       "      <td>2267</td>\n",
       "      <td>1364</td>\n",
       "      <td>2267</td>\n",
       "      <td>2267</td>\n",
       "      <td>2267</td>\n",
       "      <td>1022</td>\n",
       "      <td>1022</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             id  content  picture_lists  ncw_label  fake_label  real_label  \\\n",
       "category                                                                     \n",
       "军事         1978     1978           1149       1978        1978        1978   \n",
       "医药健康       8123     8123           4678       8123        8123        8123   \n",
       "政治         1544     1544           1026       1544        1544        1544   \n",
       "教育考试       1002     1002            452       1002        1002        1002   \n",
       "文体娱乐      10232    10232           6026      10232       10232       10232   \n",
       "社会生活      22258    22258          13377      22258       22258       22258   \n",
       "科技         2506     2506           1553       2506        2506        2506   \n",
       "财经商业       2267     2267           1364       2267        2267        2267   \n",
       "\n",
       "          comment_2c  comment_all  \n",
       "category                           \n",
       "军事               956          956  \n",
       "医药健康            3379         3379  \n",
       "政治               533          533  \n",
       "教育考试             500          500  \n",
       "文体娱乐            4727         4727  \n",
       "社会生活           10360        10360  \n",
       "科技              1212         1212  \n",
       "财经商业            1022         1022  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ncw_label</th>\n",
       "      <th>fake_label</th>\n",
       "      <th>real_label</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>category</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>军事</th>\n",
       "      <td>0.811931</td>\n",
       "      <td>0.076340</td>\n",
       "      <td>0.111729</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>医药健康</th>\n",
       "      <td>0.221962</td>\n",
       "      <td>0.408839</td>\n",
       "      <td>0.369199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>政治</th>\n",
       "      <td>0.196244</td>\n",
       "      <td>0.369819</td>\n",
       "      <td>0.433938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>教育考试</th>\n",
       "      <td>0.100798</td>\n",
       "      <td>0.507984</td>\n",
       "      <td>0.391218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>文体娱乐</th>\n",
       "      <td>0.743941</td>\n",
       "      <td>0.123827</td>\n",
       "      <td>0.132232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>社会生活</th>\n",
       "      <td>0.073771</td>\n",
       "      <td>0.458397</td>\n",
       "      <td>0.467832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>科技</th>\n",
       "      <td>0.887869</td>\n",
       "      <td>0.050279</td>\n",
       "      <td>0.061852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>财经商业</th>\n",
       "      <td>0.358183</td>\n",
       "      <td>0.305690</td>\n",
       "      <td>0.336127</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          ncw_label  fake_label  real_label\n",
       "category                                   \n",
       "军事         0.811931    0.076340    0.111729\n",
       "医药健康       0.221962    0.408839    0.369199\n",
       "政治         0.196244    0.369819    0.433938\n",
       "教育考试       0.100798    0.507984    0.391218\n",
       "文体娱乐       0.743941    0.123827    0.132232\n",
       "社会生活       0.073771    0.458397    0.467832\n",
       "科技         0.887869    0.050279    0.061852\n",
       "财经商业       0.358183    0.305690    0.336127"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group.sum()/group.count()[['ncw_label', 'fake_label', 'real_label']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "group2 = df.groupby(['ncw_label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "ncw = group2.count()[[\"content\", \"picture_lists\", \"comment_2c\", \"comment_all\"]]#.apply(lambda x: x['picture_lists']*1.0/x['content'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ncw_label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>33806</td>\n",
       "      <td>20067</td>\n",
       "      <td>15175</td>\n",
       "      <td>15175</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16104</td>\n",
       "      <td>9558</td>\n",
       "      <td>7514</td>\n",
       "      <td>7514</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           content  picture_lists  comment_2c  comment_all\n",
       "ncw_label                                                 \n",
       "0            33806          20067       15175        15175\n",
       "1            16104           9558        7514         7514"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ncw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in ncw.columns[1:]:\n",
    "    ncw['%s'%col] = ncw[\"%s\"%col]*1.0/ncw[\"content\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ncw_label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>33806</td>\n",
       "      <td>0.593593</td>\n",
       "      <td>0.448885</td>\n",
       "      <td>0.448885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16104</td>\n",
       "      <td>0.593517</td>\n",
       "      <td>0.466592</td>\n",
       "      <td>0.466592</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           content  picture_lists  comment_2c  comment_all\n",
       "ncw_label                                                 \n",
       "0            33806       0.593593    0.448885     0.448885\n",
       "1            16104       0.593517    0.466592     0.466592"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ncw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>real_label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>32945</td>\n",
       "      <td>0.599666</td>\n",
       "      <td>0.514342</td>\n",
       "      <td>0.514342</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16965</td>\n",
       "      <td>0.581727</td>\n",
       "      <td>0.338579</td>\n",
       "      <td>0.338579</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            content  picture_lists  comment_2c  comment_all\n",
       "real_label                                                 \n",
       "0             32945       0.599666    0.514342     0.514342\n",
       "1             16965       0.581727    0.338579     0.338579"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group3 = df.groupby(['real_label'])\n",
    "\n",
    "real = group3.count()[[\"content\", \"picture_lists\", \"comment_2c\", \"comment_all\"]]#.apply(lambda x: x['picture_lists']*1.0/x['content'])\n",
    "\n",
    "for col in real.columns[1:]:\n",
    "    real['%s'%col] = real[\"%s\"%col]*1.0/real[\"content\"]\n",
    "real"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "      <th>picture_lists</th>\n",
       "      <th>comment_2c</th>\n",
       "      <th>comment_all</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fake_label</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>33069</td>\n",
       "      <td>0.587469</td>\n",
       "      <td>0.400919</td>\n",
       "      <td>0.400919</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16841</td>\n",
       "      <td>0.605546</td>\n",
       "      <td>0.560002</td>\n",
       "      <td>0.560002</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            content  picture_lists  comment_2c  comment_all\n",
       "fake_label                                                 \n",
       "0             33069       0.587469    0.400919     0.400919\n",
       "1             16841       0.605546    0.560002     0.560002"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group3 = df.groupby(['fake_label'])\n",
    "\n",
    "real = group3.count()[[\"content\", \"picture_lists\", \"comment_2c\", \"comment_all\"]]#.apply(lambda x: x['picture_lists']*1.0/x['content'])\n",
    "\n",
    "for col in real.columns[1:]:\n",
    "    real['%s'%col] = real[\"%s\"%col]*1.0/real[\"content\"]\n",
    "real"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 数据调试分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataloader import ContentSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
      "  return f(*args, **kwds)\n",
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n",
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
      "  return f(*args, **kwds)\n"
     ]
    }
   ],
   "source": [
    "from train import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "    max_epoch = 10\n",
    "    best_valid_acc = 0.0\n",
    "    train_set = ContentSet(\"./data/train.csv\")\n",
    "    dev_set = ContentSet(\"./data/dev.csv\")\n",
    "    train_loader = DataLoader(train_set,batch_size=20, shuffle=True)\n",
    "    dev_loader = DataLoader(dev_set,batch_size=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<dataloader.ContentSet at 0x7f8f4e560f60>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dev_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "72"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dev_set.data[0][2].strip(\"\\t\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "d_len = [len(item[2].strip(\"\\t\"))for item in dev_set.data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[72, 78, 74, 121, 141, 96, 38, 48, 189, 125]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d_len[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "122.56456211812628"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean(d_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hadoop/.conda/envs/torch_B/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 216, got 192\n",
      "  return f(*args, **kwds)\n"
     ]
    }
   ],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f8f46dce7f0>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAffElEQVR4nO3de5Bc5Z3e8e+vu6d7LpJG0kiw6AIzGPki7EpgFYxZx1UxsRGsYzkVqBWuzWodKqQqsGtv4kogLmOHDVUm2V3WjrGrWMMGs8aCYKd2sqs1axtcu96KBWPABgkEYxC6Io3mfu/p7l/+OKdHTdMz0zPTPX2k83yqVDr99ntOn/Nqph+973su5u6IiEj8JBq9AyIi0hgKABGRmFIAiIjElAJARCSmFAAiIjGVavQOLMaGDRu8s7Oz0bshInJO+fnPf37G3TeWl59TAdDZ2UlPT0+jd0NE5JxiZm9WKtcQkIhITCkARERiSgEgIhJTCgARkZhSAIiIxJQCQEQkphQAIiIxpQAQEYkpBYCISEydU1cCR92j+4/MLn/6gxc3cE9ERBamHoCISEwpAEREYkoBICISUwoAEZGYUgCIiMSUAkBEJKYUACIiMaUAEBGJKQWAiEhMKQBERGJKASAiElMKABGRmFIAiIjElAJARCSmqgoAM9tpZofMrNfM7qjwfsbMHgvf329mnWF5h5k9bWZjZvb1kvqtZvbXZvaKmR0ws6/U6oBERKQ6CwaAmSWB+4Hrge3AzWa2vazaLcCgu18G3AfcG5ZPAV8EPl9h03/k7u8FrgB+w8yuX9ohiIjIUlTTA7gK6HX31909C+wFdpXV2QU8HC4/AVxrZubu4+7+U4IgmOXuE+7+dLicBZ4DtizjOEREZJGqCYDNwNGS18fCsop13D0HDAMd1eyAma0F/gXw4znev9XMesysp6+vr5pNiohIFRo6CWxmKeC7wNfc/fVKddz9AXff4e47Nm7cuLI7KCJyHqsmAI4DW0tebwnLKtYJv9Tbgf4qtv0A8Jq7/2kVdUVEpIaqCYBngW1m1mVmaWA30F1WpxvYEy7fCDzl7j7fRs3svxEExecWt8siIlILqYUquHvOzG4HngSSwEPufsDM7gZ63L0beBB4xMx6gQGCkADAzA4Da4C0mX0K+DgwAnwBeAV4zswAvu7u36rlwYmIyNwWDAAAd98H7Csru6tkeQq4aY51O+fYrFW3iyIiUg+6ElhEJKYUACIiMaUAEBGJKQWAiEhMKQBERGJKASAiElMKABGRmFIAiIjElAJARCSmFAAiIjGlABARiSkFgIhITCkARERiSgEgIhJTCgARkZhSAIiIxJQCQEQkphQAIiIxpQAQEYkpBYCISEwpAEREYkoBICISUwoAEZGYqioAzGynmR0ys14zu6PC+xkzeyx8f7+ZdYblHWb2tJmNmdnXy9b5dTN7MVzna2ZmtTggERGpzoIBYGZJ4H7gemA7cLOZbS+rdgsw6O6XAfcB94blU8AXgc9X2PQ3gX8LbAv/7FzKAYiIyNJU0wO4Cuh199fdPQvsBXaV1dkFPBwuPwFca2bm7uPu/lOCIJhlZhcBa9z9Z+7uwLeBTy3nQEREZHGqCYDNwNGS18fCsop13D0HDAMdC2zz2ALbBMDMbjWzHjPr6evrq2J3RUSkGpGfBHb3B9x9h7vv2LhxY6N3R0TkvFFNABwHtpa83hKWVaxjZimgHehfYJtbFtimiIjUUTUB8Cywzcy6zCwN7Aa6y+p0A3vC5RuBp8Kx/Yrc/SQwYmZXh2f//A7wl4veexERWbLUQhXcPWdmtwNPAkngIXc/YGZ3Az3u3g08CDxiZr3AAEFIAGBmh4E1QNrMPgV83N0PAv8e+F9AC/A34R8REVkhCwYAgLvvA/aVld1VsjwF3DTHup1zlPcA7692R0VEpLYiPwksIiL1oQAQEYkpBYCISEwpAEREYkoBICISUwqAGhubznHvD17hxWPDjd4VEZF5KQBq7OTQJMOTM7x8cqTRuyIiMi8FQI0NTGQBGJmaafCeiIjMTwFQY4PjxQDINXhPRETmpwCosYEwAEbVAxCRiFMA1NjsENCkegAiEm0KgBpTD0BEzhUKgBqazOaZmikAmgQWkehTANRQ8X//CYNRTQKLSMQpAGqoOP6/cXVGASAikacAqKFiD2BTe4uGgEQk8hQANTQwnqU1naS9pYnRqRzzPBVTRKThFAA1NDiRZX1bmuamJPmCM5HNN3qXRETmpACooYHxLOtagwAATQSLSLQpAGokly8wNNsDCJpV1wKISJQpAGrk5PAUBWd2CAh0LYCIRFuq0Ttwvjg6MAEEAdCUDHJVN4QTkSirqgdgZjvN7JCZ9ZrZHRXez5jZY+H7+82ss+S9O8PyQ2Z2XUn5H5jZATN7ycy+a2bNtTigRjlSDIDWs0NAI5PqAYhIdC0YAGaWBO4Hrge2Azeb2fayarcAg+5+GXAfcG+47nZgN3A5sBP4hpklzWwz8PvADnd/P5AM652zjgxMkDBY09KkSWAROSdU0wO4Cuh199fdPQvsBXaV1dkFPBwuPwFca2YWlu9192l3fwPoDbcHwfBTi5mlgFbgxPIOpbGODEywtjVNMmE0pxQAIhJ91QTAZuBoyetjYVnFOu6eA4aBjrnWdffjwB8BR4CTwLC7/22lDzezW82sx8x6+vr6qtjdxjg6MMH6tjQATUmjKWmaBBaRSGvIWUBmto6gd9AFbALazOy3K9V19wfcfYe779i4ceNK7uaiHB2cZH1rEABmxurmJp0GKiKRVk0AHAe2lrzeEpZVrBMO6bQD/fOs+8+BN9y9z91ngO8D1yzlAKJgdGqGgfHsbA8AwIAXjw3z6P4jjdsxEZF5VBMAzwLbzKzLzNIEk7XdZXW6gT3h8o3AUx7cCKcb2B2eJdQFbAOeIRj6udrMWsO5gmuBl5d/OI1xdGASgHUlAdDclJx9NoCISBQteB2Au+fM7HbgSYKzdR5y9wNmdjfQ4+7dwIPAI2bWCwwQntET1nscOAjkgNvcPQ/sN7MngOfC8ueBB2p/eCvj1MgUAO3NZ5sz05Rgakb3AhKR6KrqQjB33wfsKyu7q2R5CrhpjnXvAe6pUP4l4EuL2dmoOjM2DcCq5qbZspamJH1T043aJRGRBelWEDVQfA5AWyY5W9acSjKd0xCQiESXAqAG+sezNDclSCfPNmezhoBEJOIUADVwZmyajrYMwXx2oLkp6AEU9FAYEYkoBUAN9I9l6ViVfltZ8XYQ0zoTSEQiSgFQA/3j03S0VQ6ASQ0DiUhEKQBqIOgBZN5WVrwjqOYBRCSqFADL5O7zDgFN5RQAIhJNCoBlGpvOkc0X2NBW3gMIAyCrOQARiSYFwDL1jwXXAKwvnwNIhUNA6gGISEQpAJapfzy42rd8CKil2APQHICIRJQCYJnOhD2ADWWTwBkFgIhEnAJgmYpDQOU9gGQieCiM7ggqIlGlAFim/vBGcOVzABAMA6kHICJRpQBYpv7xLKubU2RSyXe8l1EAiEiEKQCWqX88+47x/6IWPRRGRCJMAbBM/WPTFYd/ILwjqE4DFZGIUgAsU/9Y9h33ASrKpDQEJCLRpQBYpv7x6XfcB6iopSnJpIaARCSiFADLkC84A+NZNqyaewhoWj0AEYkoBcAyDE1kKThzDgE1NyXJFVzDQCISSQqAZSg+C3iuIaDiDeFGp3Irtk8iItVSACxD8TYQc/cAguYdnZpZsX0SEalWVQFgZjvN7JCZ9ZrZHRXez5jZY+H7+82ss+S9O8PyQ2Z2XUn5WjN7wsxeMbOXzexDtTiglXT2RnDz9wBG1AMQkQhaMADMLAncD1wPbAduNrPtZdVuAQbd/TLgPuDecN3twG7gcmAn8I1wewBfBX7g7u8F/hHw8vIPZ2XNdR+gouZUcQhIPQARiZ5qegBXAb3u/rq7Z4G9wK6yOruAh8PlJ4BrzczC8r3uPu3ubwC9wFVm1g58BHgQwN2z7j60/MNZWf1j05jButY5AiCtOQARia5qAmAzcLTk9bGwrGIdd88Bw0DHPOt2AX3An5vZ82b2LTNrW9IRNNCZ8SzrW9MkE1bx/eJDYUYm1QMQkehp1CRwCrgS+Ka7XwGMA++YWwAws1vNrMfMevr6+lZyHxfUPzY95/APnJ0DGJtWD0BEoqeaADgObC15vSUsq1jHzFJAO9A/z7rHgGPuvj8sf4IgEN7B3R9w9x3uvmPjxo1V7O7KGRjPznkfIIB02ANQAIhIFFUTAM8C28ysy8zSBJO63WV1uoE94fKNwFPu7mH57vAsoS5gG/CMu78FHDWz94TrXAscXOaxrLj+sSwdqzI8uv8Ij+4/8o73ExY8FGZcASAiEZRaqIK758zsduBJIAk85O4HzOxuoMfduwkmcx8xs15ggCAkCOs9TvDlngNuc/fiZbG/B3wnDJXXgc/U+Njq7szYNBvm6QFAcEO4sWldCSwi0bNgAAC4+z5gX1nZXSXLU8BNc6x7D3BPhfIXgB2L2dkoyeYKjEzl5rwGoCiTSqgHICKRpCuBl+jsbSAW6gEoAEQkmhQAS3QmfBZwR9v8PYB0KqFJYBGJJAXAEvVX3QNIMp5VAIhI9CgAlujVt0YBuKSjdd566VSCCU0Ci0gEKQCW6Pmjg2xZ18IFq5vnrZfREJCIRJQCYImee3OIKy9et2A9TQKLSFQpAJbgxNAkb41MccXFaxesm2lKMp7NUyj4CuyZiEj1FABL8PyR4Mal1fYAACb0WEgRiRgFwBI8d2SQTCrB+y5as2Dd4v2ANAwkIlGjAFiC544M8oHN7bNf7vPJpHRHUBGJJgXAIk3n8hw4PsKVlyw8/ANnh4DUAxCRqFEALNKBEyNk8wWurGICGM4GgHoAIhI1CoBFeu7NQQCuqGICGM4OAY3rYjARiZiq7gYqZz1/ZIjNa1v48cunq6qvISARiSr1ABbp+SODVZ3/X5Ru0hCQiESTAmAR3hqe4sTwVFXn/xepByAiUaUAWIRnDg8ALK4HkExgpgAQkehRACzCX/y/N9nU3sz7N7dXvY6Z0ZZO6bGQIhI5CoAqPXdkkGcOD3DFxev43z3HFrVuWyapHoCIRI7OAlrAo/uPAPCd/W/S0pRkR2f14/9FbZkUY3oojIhEjHoAVTgzOs3BEyN88NL1s+f1L8aqTEo9ABGJHAVAFf6+9wzJhPGhSzuWtH5bWgEgItGjAFjA6NQMzx8Z5MqL17G6uWlJ22jLaBJYRKKnqgAws51mdsjMes3sjgrvZ8zssfD9/WbWWfLenWH5ITO7rmy9pJk9b2Z/tdwDqZcXjg6RKzgf3rZhydvQJLCIRNGCAWBmSeB+4HpgO3CzmW0vq3YLMOjulwH3AfeG624HdgOXAzuBb4TbK/os8PJyD6Kejg1Osq61iQ2rMkveRpvmAEQkgqrpAVwF9Lr76+6eBfYCu8rq7AIeDpefAK41MwvL97r7tLu/AfSG28PMtgC/CXxr+YdRPyeGJtm0tmVZ21iVSelWECISOdUEwGbgaMnrY2FZxTrungOGgY4F1v1T4D8Bhfk+3MxuNbMeM+vp6+urYndrZ2Rqhv7xLJuXGQBt6RTTuQK5/LyHKiKyohoyCWxmnwBOu/vPF6rr7g+4+w5337Fx48YV2LuzDp4YAVh2D6Ato1tCi0j0VHMh2HFga8nrLWFZpTrHzCwFtAP986z7SeCTZnYD0AysMbO/cPffXtJR1Fjx4q+f9p4B4KL25mVtb1UmaOaxbI721qWdSSQiUmvV9ACeBbaZWZeZpQkmdbvL6nQDe8LlG4Gn3N3D8t3hWUJdwDbgGXe/0923uHtnuL2novLlX+rE0CRrmlNLPv2zqC0MgAnNA4hIhCzYA3D3nJndDjwJJIGH3P2Amd0N9Lh7N/Ag8IiZ9QIDBF/qhPUeBw4COeA2dz9nxkFqMQEMJT0ABYCIREhV9wJy933AvrKyu0qWp4Cb5lj3HuCeebb9E+An1ezHSsrmCvSNTi/qzp9zKfYANAcgIlGiK4Hn8NbIFA5sal9+D6A4CawegIhEiQJgDieGJgHYtHZ5E8BwdghIF4OJSJQoAOZwYmiS1nSS9pbln7UzOwSkW0KLSIQoAOZwYmiSzWtbCC5oXh5NAotIFCkAKsjlC5wameaiGoz/Q/Bg+GTCNAQkIpGiAKjg1Og0efeajP9D8bnASZ0FJCKRogCooDgBvNx7AJXSDeFEJGoUABWcGpmiKWmsa0vXbJu6JbSIRI0CoIKhiRnWtaZJ1GACuKhNPQARiRgFQAXDkzOsrfFN2/RgeBGJGgVABUOTM7S31G74B4qPhdQksIhEhwKgzNRMnvHpXE0uACulISARiRoFQJmTw1MA9RkC0pXAIhIhCoAyxVNA19ahB6A5ABGJEgVAmePFAGit8RxAOslM3pnOaR5ARKJBAVDmxNAkBqxprupRCVXTMwFEJGoUAGVODE2yqjlFKlnbpmnTLaFFJGIUAGVODk/VfPwfdEdQEYkeBUCZ40OTNT8FFNQDEJHoUQCUcHdODE3WfAIYYJUeCykiEaMAKDE4McPUTKHOPQBNAotINCgASsxeA1Dji8AA2tIaAhKRaKkqAMxsp5kdMrNeM7ujwvsZM3ssfH+/mXWWvHdnWH7IzK4Ly7aa2dNmdtDMDpjZZ2t1QMsxew1Aje8DBJoEFpHoWTAAzCwJ3A9cD2wHbjaz7WXVbgEG3f0y4D7g3nDd7cBu4HJgJ/CNcHs54D+6+3bgauC2CttccSfDAGivRw8gDICRqZmab1tEZCmq6QFcBfS6++vungX2ArvK6uwCHg6XnwCuteBp6ruAve4+7e5vAL3AVe5+0t2fA3D3UeBlYPPyD2d5TgxPkUklaEsna77tdCrBBaszHB2YrPm2RUSWopoA2AwcLXl9jHd+Wc/WcfccMAx0VLNuOFx0BbC/0oeb2a1m1mNmPX19fVXs7tIdH5pk09oWrIYPginVuaGNw/3jddm2iMhiNXQS2MxWAd8DPufuI5XquPsD7r7D3Xds3LixrvtzYmiyZg+Cr6Sro43DZxQAIhIN1QTAcWBryestYVnFOmaWAtqB/vnWNbMmgi//77j795ey87V2YmiSTe21exB8uc4NbfSPZzUPICKRUE0APAtsM7MuM0sTTOp2l9XpBvaEyzcCT7m7h+W7w7OEuoBtwDPh/MCDwMvu/ie1OJDlyuYKnB6dZtPa+gVA14ZWAPUCRCQSFgyAcEz/duBJgsnax939gJndbWafDKs9CHSYWS/wH4A7wnUPAI8DB4EfALe5ex74DeBfAx81sxfCPzfU+NgW5dTIFO6wua4BsAqANxQAIhIBVd3z2N33AfvKyu4qWZ4Cbppj3XuAe8rKfgrUZ6Z1iYoXgW1a28KRgYm6fMYlHcUeQH22LyKyGLoSOHRiuBgA9ZsEbm5Ksqm9WWcCiUgkKABCJ4aCZwFfVMdJYAgmgjUEJCJRoAAIHRucZH1bmpY6XARWStcCiEhUKABCvzo9xrs2ttX9c7o62hiamGFoIlv3zxIRmY8CgOA5AC8eH8bMeHT/kbp+VueGIGQ0DCQijaYAAE6PTjM5k+fCNfWbAC6avRZAw0Ai0mAKAODQW6MAXLgmU/fP2rq+lYTBG30KABFprKquAzjfvXoqDIDV9ekBlA4rffqDF7N5XQtv9OtaABFpLPUACHoAqzKp2Xv211unbgonIhGgAABePT22IsM/RV0bggAIbpckItIYsQ+AQsF57dToikwAF3V2tDE6naN/XKeCikjjxD4Ajg9NMpFdmTOAirrCU0E1DCQijRT7ADh7BtAK9gB0LYCIREDszwI6FJ4BdMHqlZkDeHT/EfIFJ2Fw8GTFh6CJiKyI2PcAXj01yua1LTQ31fceQKWSCePyTe08uv8IR+t062kRkYUoAE6N8e4LV634597wgYtIJYwvdR/Q2UAi0hCxDoBcvsCvTo/x7l9bveKf3d7SxB987N089cppfnjw1Ip/vohIrAPgcP8E2XyB91y48gEAsOeaTt77a6v5r//3IBPZXEP2QUTiK9YBULwFxLsbFABNyQR/+Kn3c3xokv/8vRcVAiKyomIdAIfeGiVhcNkFKz8HAMEZQa+dGuNj2y/kr355gk/8z5/y0vHhhuyLiMRPbAPA3fmH3jN0drSt6BlAlfyz91zAZ67p4szoNLu+/g989UevMTat3oCI1FdsA+CxZ4/S8+Ygt/zTrkbvChD0Qn7vo9t430Wrue9Hr/Lhe5/i/qd7FQQiUjexvBDs5PAk9/z1y3zo0g5u/icXN3p3ZrVlUnz6g5dwdGCCH79yiv/x5CG+9uPXuOZdHXz0fRfy4cs2cPH6VpIJa/Suish5oKoAMLOdwFeBJPAtd/9K2fsZ4NvArwP9wG+5++HwvTuBW4A88Pvu/mQ126wXd+e/fP9FcgXnK//qAyQi+GW6dX0rv3tNF0cHJvjlsSF+cWyYpw/1AdDclOCyC1bxro2r2LS2hU3tzVy4ppmOVWnWt2VY35pmdXMqksclItGyYACYWRK4H/gYcAx41sy63f1gSbVbgEF3v8zMdgP3Ar9lZtuB3cDlwCbgR2b27nCdhbZZU8OTMxw+M85PDvXx9KE+vviJ7VzSUf+HwC/H1vWtbF3fyg0fcPrGpjk6MMGpkWlOjUzxd6/2MTKZI1/hIrKEBdcZrA3DYE1zE6syKTJNCZqSCdKpBE0JI5UMXrc0JWlNJ2lJh383JWluSpJMGGZgGHl3cvkCM3lnOpdnbDrH+HSOsek84+FyvuA0NwXbaW5Kzm43kTBGJmcYmZpheGKGwYksgxMzTGbzrG1tYsOqDBtWpdm8toUt61rZvK6F1c0p2tIpWtJJ0snEnIHm7uQKTr7guPO29nB3HPACFMLl0jZqCo8/VbLtgjszeWemUGAmVyBXcGbyBfIFp+DB+wCp2fYzMskkmaYE6WQCh7CuM50rMJ3LMz1TmN3HfMExg4QZyYSRShiZVPBvErS3YYAZlP7TJsxIJML1zN7WHoXw8/IetkEhWM7ng3IzI2FgZjQljXQyQSpZ/ehv/m1tcLYdUwkjlQjab77/cHjYpvlC0K6FgmMYGLNtkQh/zhIJSIZtYzb/f2I8PN4iC4/xXFIonG3P4r/7Sh5DNT2Aq4Bed38dwMz2AruA0i/rXcCXw+UngK9bcBS7gL3uPg28YWa94faoYps185tf+3sOnDh7352rL13P717TWY+Pqgsz44LVzVxQ9sSygjtj0zlGJ3OMZ4Mv4fFsnslsjolsnsmZPFMzeYYmZpiayc9+CeUKTqH4JRGWLUc6mSCTSmAGM3knVwiColwqYbQ0FYMmRTplHD6T5aXjw4xO5cjNsx9mwfrB1yM4PvulHFcJY8nHnzBmv2STFgQ9BKHjBO1aDNdqLlQv/vskSr683CFXKCx5H82YDTuD4IsyDOG8V96v0uOa/ULl7PEBrNTXa+nuFdvVndn/qMz1e5dMnP03Kd3/5774sZqfsFJNAGwGjpa8PgZ8cK467p4zs2GgIyz/Wdm6m8PlhbYJgJndCtwavhwzs0NV7PO83gQe+3fvKN4AnFnuts9jap+FqY3mp/ZZ2Jxt1PKHy9ruJZUKIz8J7O4PAA/U+3PMrMfdd9T7c85Vap+FqY3mp/ZZ2Eq3UTUDgceBrSWvt4RlFeuYWQpoJ5gMnmvdarYpIiJ1VE0APAtsM7MuM0sTTOp2l9XpBvaEyzcCT3lwi8tuYLeZZcysC9gGPFPlNkVEpI4WHAIKx/RvB54kOGXzIXc/YGZ3Az3u3g08CDwSTvIOEHyhE9Z7nGByNwfc5u55gErbrP3hLUrdh5nOcWqfhamN5qf2WdiKtpHpXvQiIvEU21tBiIjEnQJARCSmYh8AZrbTzA6ZWa+Z3dHo/WkkMztsZi+a2Qtm1hOWrTezH5rZa+Hf68JyM7Ovhe32SzO7srF7X3tm9pCZnTazl0rKFt0eZrYnrP+ame2p9Fnnqjna6Mtmdjz8OXrBzG4oee/OsI0Omdl1JeXn5e+hmW01s6fN7KCZHTCzz4bl0fg5Ci6njucfggnoXwGXAmngF8D2Ru9XA9vjMLChrOy/A3eEy3cA94bLNwB/Q3Bh5dXA/kbvfx3a4yPAlcBLS20PYD3wevj3unB5XaOPrc5t9GXg8xXqbg9/xzJAV/i7lzyffw+Bi4Arw+XVwKthO0Ti5yjuPYDZ21y4exYo3pJCztoFPBwuPwx8qqT82x74GbDWzC5qxA7Wi7v/HcFZbaUW2x7XAT909wF3HwR+COys/96vjDnaaC6zt4Zx9zeA4q1hztvfQ3c/6e7PhcujwMsEd0OIxM9R3AOg0m0uNs9RNw4c+Fsz+3l4Cw6AC939ZLj8FnBhuBzXtltse8S1nW4PhzAeKg5vEPM2MrNO4ApgPxH5OYp7AMjbfdjdrwSuB24zs4+UvulBX1TnDYfUHnP6JvAu4B8DJ4E/buzuNJ6ZrQK+B3zO3UdK32vkz1HcA0C3pCjh7sfDv08D/4ega36qOLQT/n06rB7Xtltse8Sundz9lLvn3b0A/Bln7wAcyzYysyaCL//vuPv3w+JI/BzFPQB0S4qQmbWZ2eriMvBx4CXefpuPPcBfhsvdwO+EZy1cDQyXdGnPZ4ttjyeBj5vZunAo5ONh2XmrbC7oXxL8HEEMbw1jZkZwp4SX3f1PSt6Kxs9Ro2fJG/2HYNb9VYKzEL7Q6P1pYDtcSnD2xS+AA8W2ILit94+B14AfAevDciN4qM+vgBeBHY0+hjq0yXcJhjBmCMZcb1lKewD/hmDCsxf4TKOPawXa6JGwDX4ZfqFdVFL/C2EbHQKuLyk/L38PgQ8TDO/8Engh/HNDVH6OdCsIEZGYivsQkIhIbCkARERiSgEgIhJTCgARkZhSAIiIxJQCQEQkphQAIiIx9f8B+v3QsFu6ugsAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.distplot(d_len, 100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
